3  Results

Code
library(viridis)
library(leaflet)
library(ggplot2)
library(dplyr)
library(tidyr)
library(naniar)
library(ggmap)
library(lubridate)
library(sf)
library(dygraphs)
library(xts)
library(ggiraph)
library(sf)
library(ggalluvial)
library(patchwork)
library(vcd)
library(forcats)
library(RColorBrewer)
library(devtools)
library(jsonlite)
# Read in data
data <- read.csv("NYPD_Arrest_Data.csv", na.strings = c("(null)", "N/A"))
# Data Preprocessing step
data <- na.omit(data)
data$ARREST_DATE <- as.Date(data$ARREST_DATE, format = "%m/%d/%Y")
data <- data |>
  mutate(ARREST_BORO = case_when(
    ARREST_BORO == "B" ~ "Bronx",
    ARREST_BORO == "S" ~ "Staten Island",
    ARREST_BORO == "K" ~ "Brooklyn",
    ARREST_BORO == "M" ~ "Manhattan",
    ARREST_BORO == "Q" ~ "Queens"
    )) |>
  mutate(LAW_CAT_CD = case_when(
    LAW_CAT_CD == "F" ~ "Felony",
    LAW_CAT_CD == "M" ~ "Misdemeanor",
    LAW_CAT_CD == "V" ~ "Violation",
    .default = NA
  )) |>
  mutate(JURISDICTION_CODE = case_when(
    JURISDICTION_CODE == 0 ~ "Patrol",
    JURISDICTION_CODE == 1 ~ "Transit",
    JURISDICTION_CODE == 2 ~ "Housing",
    .default = "Non NYPD"
  )) |>
  mutate(PERP_SEX = case_when(
    PERP_SEX == "F" ~ "Female",
    PERP_SEX == "M" ~ "Male"
  )) |>
  mutate(Latitude = as.numeric(Latitude),
         Longitude = as.numeric(Longitude)
  ) |> 
  mutate(Weekday = wday(ARREST_DATE, label = TRUE, abbr = TRUE)
  )

3.1 Demographic Perspectives on Arrest Statistics

3.1.1 Overall demographic distribution

Code
age_count <- data |>
  group_by(AGE_GROUP) |>
  summarize(Count = n())

plot1 <- ggplot(age_count, aes(x = AGE_GROUP, y = Count)) +
  geom_bar(stat = "identity", fill = "lightblue") +
  ggtitle("Number of Arrests by Age group") +
  xlab("Perp Age Group") +
  ylab("Number of Arrests")

gender_counts <- data |>
  group_by(PERP_SEX) |>
  summarize(Count = n()) |>
  mutate(Percentage = Count / sum(Count) * 100,
         Label = paste0(PERP_SEX, " ", round(Percentage, 1), "%"),
         Position = cumsum(Count) - Count / 2)

plot2 <- ggplot(gender_counts, aes(x = 2, y = Count, fill = PERP_SEX)) +
  geom_bar(stat = "identity", width = 1, color = "white") +
  coord_polar("y", start = 0) +
  labs(title = "Perp Gender") +
  theme_void() +
  scale_fill_manual(values = c("steelblue", "lightblue")) +
  geom_text(aes(x = 3, y = Position, label = Label), color = "black", size = 5) + 
  theme(legend.position = "none") + xlim(0.5, 3)

race_count <- data |>
  group_by(PERP_RACE) |>
  summarize(Count = n()) |>
  arrange(desc(Count))

plot3 <- ggplot(race_count, aes(x = reorder(PERP_RACE, -Count), y = Count)) +
  geom_bar(stat = "identity", fill = "lightblue") +
  ggtitle("Number of Arrests by PERP_RACE") +
  xlab("PERP_RACE") +
  ylab("Number of Arrests") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

top_15_ofns_desc <- data |>
  group_by(OFNS_DESC) |>
  summarise(Count = n()) |>
  arrange(desc(Count)) |>
  head(15)

plot4 <- ggplot(top_15_ofns_desc, aes(x = reorder(OFNS_DESC, Count), 
                                      y = Count)) +
  geom_bar(stat = "identity", fill = "lightblue") +
  labs(title = "Top 15 Arrest Categories",
       x = "Offense Description", y = "Number of Arrests") +
  coord_flip()
layout <- "\nABD\nCCD"
(plot1 + plot2 + plot3) + plot4 + plot_layout(design = layout, widths = c(1, 1))

This plot provides a comprehensive overview of demographic patterns and crime-related data. - The top-left plot displays the distribution of arrests by age group, revealing that individuals aged 25-44 represent the highest proportion of arrests, followed by the 18-24 and 45-64 age groups. - The pie chart at the top-right illustrates the gender distribution of arrests, with males accounting for a dominant 82.1% of all arrests, compared to 17.9% for females. - The bottom-left plot focuses on the perpetrator’s race description, showing that Black individuals are most frequently arrested, followed by White Hispanics, Black Hispanics, and other racial categories. - Finally, the bar chart on the bottom-right lists the top 15 offense categories, where Assault 3 & Related Offenses leads, followed by Petit Larceny and Felony Assault.

For the demographic section, it is important to note that the racial categories “Unknown” and “American Indian/Alaskan Native” represent a very small proportion of the overall arrests. Therefore, these groups are excluded from the analysis to focus on the more predominant racial categories.

The demographic distribution chart offers a foundational understanding of the patterns observed in arrests. However, to delve deeper into the interplay between these demographic factors and their combined influence on arrest patterns, the alluvial flow chart offers a more dynamic and interconnected perspective. It allows us to visualize how different demographic categories, such as gender and race, flow into specific age groups, revealing hidden relationships within the data.

3.1.2 Flow of Arrests by Demographics

Code
alluvia_data <- na.omit(data) |> 
  filter(!(PERP_RACE %in% c("UNKNOWN", "AMERICAN INDIAN/ALASKAN NATIVE"))) |>
  group_by(PERP_SEX, PERP_RACE, AGE_GROUP) |>
  summarise(Count = n(), .groups = "drop") |>
  rename(Sex = PERP_SEX, Race = PERP_RACE, AgeGroup = AGE_GROUP)

ggplot(alluvia_data, aes(axis1 = Sex, axis2 = Race, 
                         axis3 = AgeGroup, y = Count)) +
  geom_alluvium(aes(fill = Race), width = 0.2, alpha = 0.8) +
  geom_stratum(width = 0.2, fill = "lightgrey", color = "black") +
  geom_text(stat = "stratum", aes(label = after_stat(stratum)), size = 2) +
  scale_x_discrete(limits = c("Sex", "Race", "AgeGroup"), expand = c(0.1, 0.1)) +
  theme_minimal() +
  scale_fill_brewer(type='qual') + 
  labs(title = "Flow of Arrests by Demographics",
       x = "Demographic Categories",
       y = "Number of Arrests", fill = "Race")

This Alluvia diagram clearly shows a dominant overrepresentation of Black and white hispanic males, especially in the 25-44 age group, in arrest data. This trend, combined with the overwhelmingly higher number of male arrests across all races, suggests that certain demographics are being disproportionately affected. In contrast, smaller flows from groups like Asian/Pacific Islanders stand out, emphasizing how arrests are concentrated among specific racial and gender groups. These patterns raise important questions about potential systemic issues or societal factors that may be driving these disparities.

One significant influence on this disparities may be the areas where individuals reside, as different boroughs often reflect varying living standards, access to resources, and community dynamics. To explore this further, the following section will delve into the arrest distribution across different boroughs, investigating on how geography might play a role in these demographic trends.

3.1.3 Mosaic Plot: Borough, Race, and Offense Level

Code
mosaic_data <- data |> filter(!is.na(LAW_CAT_CD)) |> 
  filter(!(PERP_RACE %in% c("UNKNOWN", "AMERICAN INDIAN/ALASKAN NATIVE"))) |>
  mutate(LAW_CAT_CD = factor(LAW_CAT_CD, levels = c("Violation", "Misdemeanor", "Felony")))

fill_colors <- RColorBrewer::brewer.pal(3, "Blues")
mosaic(LAW_CAT_CD ~ ARREST_BORO + PERP_RACE, data = mosaic_data,
       labeling_args = list(
         set_varnames = c(
           ARREST_BORO = "Borough", 
           PERP_RACE = "Race", 
           LAW_CAT_CD = "Offense Level"
         ),
         rot_labels = c(0, 0, 90), 
         gp_labels = list(LAW_CAT_CD = gpar(fontsize = 7),
                          PERP_RACE = gpar(fontsize = 7))),
       direction = c("v","h","h"),
       highlighting_fill = fill_colors,
       main = "Mosaic Plot: Borough, Race, and Offense Level")

The mosaic plot reveals that while offense levels show consistent patterns across all boroughs - approximately half of the arrests are for felonies, nearly half are for misdemeanors, and violations constitute only a small proportion - there are striking disparities in the racial composition of arrests across boroughs, highlighting that borough is statistically significant to race. For instance, Black individuals dominate arrests in the Brooklyn, with proportions notably higher than in other boroughs, indicating a geographic concentration of demographic composition. In contrast, White individuals are more frequently arrested in Staten Island compared to the others, highlighting localized variations in racial arrest patterns. Similarly, White Hispanic and Black Hispanic individuals exhibit different arrest distributions, with greater white Hispanic representation in Brooklyn and Queens, and greater black Hispanic in Bronx and Manhattan. These variations point to the conclusion that the borough of residence plays a significant role in shaping arrest patterns across racial groups.

For the next section, we aim to find out the overall geographical patterns.

3.2 Spatial Analysis of Arrests in New York City

3.2.1 Arrest Locations in NYC

Code
nyc_sf <- read_sf("new-york-city-boroughs.geojson")
data <- data |>
  filter(Longitude != 0 & Latitude != 0)
arrest_sf <- st_as_sf(data, coords = c("Longitude", "Latitude"), crs = 4326)

borough_arrest_count <- arrest_sf |>
  st_drop_geometry() |>
  group_by(ARREST_BORO) |>
  summarise(total_arrests = n())

nyc_sf <- nyc_sf |>
  left_join(borough_arrest_count, by = c("name" = "ARREST_BORO")) |>
  mutate(tooltip = paste(name, "<br>Total Arrests:", total_arrests))

interactive_map <- ggplot() +
  geom_sf_interactive(data = nyc_sf,
                      aes(fill = name, geometry = geometry, tooltip = tooltip),
                      color = "black", size = 0.3, alpha = 0.5) +
  geom_sf(data = arrest_sf, aes(geometry = geometry),
          color = "red", size = 0.05, alpha = 0.4, stroke = 0.3, shape = 1) +
  labs(title = "Arrest Locations in NYC", 
       x = "Longitude", y = "Latitude", fill = "Borough") +
  coord_sf() +
  theme_minimal() +
  theme(plot.title = element_text(size = 16, face = "bold"),
        plot.subtitle = element_text(size = 12, face = "italic"),
        legend.title = element_text(size = 12),
        legend.text = element_text(size = 10),
        plot.margin = margin(1, 1, 1, 1, "cm"))

girafe(ggobj = interactive_map)

Brooklyn leads with the highest number of arrests (53,987), showcasing widespread clusters, particularly in its northern and eastern neighborhoods. These areas may experience higher crime rates due to concentrated urban activity and demographic factors.

Manhattan follows with 46,226 arrests, heavily concentrated in its central and southern regions, including areas like Midtown and Downtown. The high urban density, commercial activity, and large influx of daily commuters likely contribute to this concentration. The Bronx, with 43,896 arrests, shows dense clusters in its central and southern areas, reflecting persistent socio-economic challenges and a relatively high population density.

Queens accounts for 42,969 arrests, characterized by a more dispersed pattern with prominent clusters in neighborhoods like Jamaica and Flushing. The suburban nature of Queens contrasts with the dense urban dynamics of Manhattan and the Bronx. Staten Island, with only 8,341 arrests, has the lowest crime figures, reflecting its smaller population and suburban characteristics, with most arrests concentrated in the northern part of the borough.

Building on the borough-level insights, the precinct-level analysis provides a finer-grained view of arrest distributions across New York City.

3.2.2 Arrests by Police Precinct in NYC

Code
boroughs <- read_sf("new-york-city-boroughs.geojson")
precincts <- read_sf("Police Precincts.geojson")
boroughs <- st_make_valid(boroughs)
precincts <- st_make_valid(precincts)
boroughs <- st_transform(boroughs, crs = st_crs(precincts))
precincts_with_boroughs <- st_join(precincts, boroughs, join = st_intersects)
precincts_with_boroughs <- precincts_with_boroughs |>
  select(precinct, name, geometry) |>
  rename(Borough = name)

precincts_with_boroughs <- precincts_with_boroughs |>
  mutate(precinct = as.integer(precinct))
precinct_arrest_counts <- data |>
  group_by(ARREST_PRECINCT) |>
  summarise(Count = n())
precincts_with_boroughs <- precincts_with_boroughs |>
  left_join(precinct_arrest_counts, by = c("precinct" = "ARREST_PRECINCT"))
precincts_with_boroughs <- precincts_with_boroughs |>
  mutate(tooltip = paste0(
    "<strong>Precinct: </strong>", precinct, "<br>",
    "<strong>Borough: </strong>", Borough, "<br>",
    "<strong>Number of Arrests: </strong>", Count
  ))

pal <- colorNumeric(
  palette = "Reds", 
  domain = precincts_with_boroughs$Count
)
leaflet(precincts_with_boroughs) |>
  addProviderTiles("OpenStreetMap") |>
  addPolygons(fillColor = ~pal(Count), weight = 1, opacity = 1, color = "white",
              dashArray = "3", fillOpacity = 0.7, 
              highlight = highlightOptions(
                weight = 5, color = "#666", dashArray = "", 
                fillOpacity = 0.7,bringToFront = TRUE),
              label = ~paste0("Precinct: ", precinct, "<br>", 
                              "Borough: ", Borough, "<br>",
                              "Number of Arrests: ", Count)) |>
  leaflet::addLegend(pal = pal, values = precincts_with_boroughs$Count, opacity = 0.7,
            title = "Number of Arrests", position = "topright")

The precinct map highlights significant variation in arrest numbers within boroughs, revealing areas of concentrated activity. Notably, precincts in Brooklyn and Manhattan exhibit some of the highest arrest counts, with certain northern and central precincts in Brooklyn and Manhattan dominating the map. These areas align with dense urban neighborhoods characterized by high population and commercial activity. In contrast, precincts in Staten Island consistently show the lowest arrest counts, reinforcing its suburban character. The map also highlights pockets of high arrests in Queens and the Bronx, particularly in precincts associated with well-known urban hubs like Jamaica and the South Bronx. This spatial breakdown underscores the importance of precinct-level data in understanding localized crime dynamics, enabling targeted interventions and resource allocation.

3.3 Temporal Patterns of Crime in NYC

3.3.1 Daily NYPD Arrests

Code
daily_data <- data |>
  group_by(ARREST_DATE) |>
  summarise(Count = n(), .groups = "drop")
  
ggplot(daily_data, aes(x = ARREST_DATE, y = Count)) +
  geom_line(linewidth = 0.8, alpha = 0.8) +
  labs(title = "Daily NYPD Arrests",
       subtitle = "From January 1st to September 30th",
       x = "Date", y = "Number of Arrests", color = "Borough") +
  scale_x_date(date_breaks = "2 week", date_labels = "%b %d") +
  theme_minimal() +
  theme(plot.title = element_text(size = 16, face = "bold"),
        plot.subtitle = element_text(size = 12),
        axis.text.x = element_text(angle = 45, hjust = 1),
        legend.title = element_text(size = 12),
        legend.text = element_text(size = 10))

The total number of arrests remains relatively stable throughout the nine-month period. There are no drastic upward or downward shifts that would indicate a significant change in arrest rates across months. This means that there is no seasonal patterns being observed, so arrests are not significantly influenced by changes in weather or seasonal events during this time period. Additionally, the number of arrests fluctuates significantly throughout the week, with regular peaks followed by dips. The highest arrest counts tend to occur early in the week, around Mondays or Tuesdays, and gradually decline toward the weekends. To better understand the factors driving these fluctuations, the next stage of the analysis will focus on examining arrests by specific weekdays. This deeper investigation will help uncover how arrest patterns vary across the week and provide insights into the underlying dynamics shaping these trends.

3.3.2 Daily NYPD Arrests by Weekday

Code
weekday_summary <- data |>
  group_by(ARREST_DATE, Weekday) |>
  summarise(Count = n(), .groups = "drop")  

ggplot(weekday_summary, aes(x = ARREST_DATE, y = Count, 
                            group = Weekday, fill = Count)) +
  geom_area(alpha = 0.8) +  
  geom_line(linewidth = 1, color = "black") +  
  facet_grid(Weekday ~ ., scales = "fixed", switch = "y") +  
  scale_fill_gradient(low = "lightblue", high = "darkblue", 
                      name = "Arrest Count") +
  scale_x_date(date_breaks = "1 month", date_labels = "%b") +
  labs(title = "Daily NYPD Arrests by Weekday",
       x = "Date", y = "Number of Arrests") +
  theme_minimal() +
  theme(plot.title = element_text(size = 16, face = "bold"),
        axis.text.x = element_text(angle = 45, hjust = 1), 
        strip.text.y.left = element_text(angle = 0),      
        strip.placement = "outside", legend.position = "right")

The graph visualizes daily NYPD arrests by weekday over a nine-month period, with each weekday represented by a separate facet. The color gradient, ranging from light blue to dark blue, reflects the arrest counts for each day, providing an additional layer of information to highlight fluctuations. While the Y-axis shows the count of arrests, the fluctuations within each line can be subtle, especially since the scale is fixed across all weekdays. Including the color gradient helps emphasize variations in arrest counts more clearly, even for weekdays with smaller changes. The patterns reveal that weekdays like Tuesday and Wednesday have consistently higher arrest counts, while weekends, particularly Saturday and Sunday, show significantly lower counts. This supports the observation of a strong weekly cycle in arrest activity, with midweek peaks and weekend lulls.

Building on the weekday-specific arrest patterns, it is essential to understand how arrests vary across NYC boroughs over time.

3.3.3 Daily NYPD Arrests by Borough

Code
daily_summary_by_borough <- data |>
  group_by(ARREST_DATE, ARREST_BORO) |>
  summarise(Count = n(), .groups = "drop")

daily_arrest_data <- daily_summary_by_borough |>
  select(ARREST_DATE, ARREST_BORO, Count) |>
  rename(date = ARREST_DATE, borough = ARREST_BORO, count = Count)
write_json(daily_arrest_data, "daily_arrest_data.json", pretty = TRUE)

ggplot(daily_summary_by_borough, aes(x = ARREST_DATE, y = Count, color = ARREST_BORO)) +
  geom_line(linewidth = 0.8, alpha = 0.8) +
  labs(
    title = "Daily NYPD Arrests by Borough",
    subtitle = "From January 1st to September 30th",
    x = "Date",
    y = "Number of Arrests",
    color = "Borough"
  ) +
  scale_x_date(date_breaks = "2 week", date_labels = "%b %d") +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 16, face = "bold"),
    plot.subtitle = element_text(size = 12),
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.title = element_text(size = 12),
    legend.text = element_text(size = 10)
  )

This chart highlights how arrest counts fluctuate over time across the five NYC boroughs. Staten Island consistently exhibits the lowest arrest counts, with a relatively stable pattern that never exceeds 100 arrests per day. In contrast, the Bronx, Brooklyn, Manhattan, and Queens display overlapping trends with higher daily counts, making it challenging to differentiate between these boroughs. Therefore, it is necessary to create a boxplot to better capture and compare the overall distribution and variation of daily arrest counts across the five boroughs.

3.3.4 Weekly NYPD Arrests by Borough

Code
weekly_summary_boxplot <- data |>
  mutate(Week = floor_date(ARREST_DATE, unit = "week")) |>  
  group_by(Week, ARREST_BORO) |>                           
  summarise(Count = n(), .groups = "drop") |> 
  filter(Week < max(Week))

ggplot(daily_summary_by_borough, aes(x = reorder(ARREST_BORO, -Count, FUN = median), 
                                     y = Count, fill = ARREST_BORO)) +
  geom_boxplot(outlier.shape = 16, alpha = 0.7, outlier.size = 2) + 
  labs(title = "Distribution of Daily NYPD Arrests by Borough",
       subtitle = "From January 1st to September 30th",
       x = "Borough", y = "Number of Daily Arrests", fill = "Borough") +
  theme_minimal() +
  theme(plot.title = element_text(size = 16, face = "bold"),
        plot.subtitle = element_text(size = 12),
        axis.text.x = element_text(size = 12),
        axis.title.x = element_text(size = 14),
        axis.title.y = element_text(size = 14),
        legend.title = element_text(size = 12),
        legend.text = element_text(size = 10))

Brooklyn stands out with the highest median daily arrests and a wider interquartile range, indicating more variability in arrest counts compared to other boroughs. This aligns with its larger population and higher levels of urban activity. Manhattan follows with a slightly lower median but still maintains a wide IQR, indicating variability likely influenced by high urban density and commercial activity. The Bronx and Queens display similar medians, though the Bronx exhibits a slightly narrower IQR, suggesting more consistency in daily arrest figures. Staten Island, on the other hand, shows significantly lower arrests with a tight interquartile range, highlighting its smaller population and suburban characteristics.